{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "缺失特征处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.697     , 0.17466667],\n",
       "       [0.774     , 0.376     ],\n",
       "       [0.666     , 0.091     ],\n",
       "       [0.245     , 0.057     ]])"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "from sklearn.experimental import enable_iterative_imputer\n",
    "from sklearn.impute import SimpleImputer, IterativeImputer\n",
    "\n",
    "X = np.array([\n",
    "    [1, '青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, np.nan],\n",
    "    [2, '乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.774, 0.376],\n",
    "    [3, '乌黑', '稍蜷', '沉闷', '-', '稍凹', '硬滑', 0.666, 0.091],\n",
    "    [4, '浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', 0.245, 0.057],\n",
    "])\n",
    "imp_mean = SimpleImputer(strategy='mean')\n",
    "imp_mean.fit_transform(X[:, 7:9]) # 用均值填充\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.697, 0.091],\n",
       "       [0.774, 0.376],\n",
       "       [0.666, 0.091],\n",
       "       [0.245, 0.057]])"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "imp_median = SimpleImputer(strategy='median')\n",
    "imp_median.fit_transform(X[:, 7:9]) # 用中位数填充"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'],\n",
       "       ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑'],\n",
       "       ['乌黑', '稍蜷', '沉闷', '清晰', '稍凹', '硬滑'],\n",
       "       ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑']], dtype=object)"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "imp_frequent = SimpleImputer(missing_values='-', strategy='most_frequent')\n",
    "imp_frequent.fit_transform(X[:, 1:7].astype('object')) # 用众数填充"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.697     , 0.20908713],\n",
       "       [0.774     , 0.376     ],\n",
       "       [0.666     , 0.091     ],\n",
       "       [0.245     , 0.057     ]])"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 回归器默认采用BayesianRidge 也可选DecisionTreeRegressor ExtraTreesRegressor KNeighborsRegressor\n",
    "imp_iter = IterativeImputer()\n",
    "imp_iter.fit_transform(X[:, 7:9])"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 64-bit",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
